suppressPackageStartupMessages({
library(epiwraps)
library(rtracklayer)
library(GenomicRanges)
library(ComplexHeatmap)
library(chromVAR)
library(motifmatchr)
library(memes)
library(MotifDb)
library(universalmotif)
library(TFBSTools)
library(AnnotationHub)
library(RColorBrewer)
library(viridis)
library(viridisLite)
library(ggrepel)
library(limma)
library(cowplot)
library(edgeR)
library(SummarizedExperiment)
library(SEtools)
library(sechm)
library(edgeR)
library(ggplot2)
library(grid)
library(cowplot)
library(rGREAT)
library(topGO)
library(fgsea)
library(viper)
library(knitr)
})
## Possible Ensembl SSL connectivity problems detected.
## Please see the 'Connection Troubleshooting' section of the biomaRt vignette
## vignette('accessing_ensembl', package = 'biomaRt')Error in curl::curl_fetch_memory(url, handle = handle) :
## Peer certificate cannot be authenticated with given CA certificates: [uswest.ensembl.org] SSL certificate problem: certificate has expired
## See system.file("LICENSE", package="MotifDb") for use restrictions.
##
## groupGOTerms: GOBPTerm, GOMFTerm, GOCCTerm environments built.
theme_set(theme_bw())
set.seed(123)
BiocParallel::register(BPPARAM = BiocParallel::MulticoreParam(6))
load data tracks
bigwig_full <- list.files("../Tdg_wt/tracks",pattern = ".bw$",full.names = TRUE)
names(bigwig_full) <-c("TDG_KO1","TDG_KO2","TDG_KO3","Control1","Control2")
peaks
setwd("../Tdg_wt/peaks/peaks")
peaks_TDG_BP <- c(TDG_KO1_BP = rtracklayer::import("Kg_tdg_ko_1_EKDL220000948-1a_H22J7DSX3_L2_peaks.broadPeak"), TDG_KO2_BP = rtracklayer::import("Kg_tdg_ko_2_EKDL220003833-1a_HJ5T2DSX3_L1_peaks.broadPeak"),TDG_KO3_BP = rtracklayer::import("Kg_tdg_ko_3_EKDL220003834-1a_HJ5T2DSX3_L1_peaks.broadPeak"))
peaks_Ctrl_BP <- c(Control_1 = rtracklayer::import("Kg_tdg_M_1_EKDL220003835-1a_HJ5T2DSX3_L1_peaks.broadPeak"), Control_2 = rtracklayer::import("Kg_tdg_M_3_EKDL220003836-1a_HJ5T2DSX3_L1_peaks.broadPeak"))
consensus peaks
blacklist <- import("/reference/Mus_musculus/mm10.blacklist_chrM.bed")
blacklist
## GRanges object with 165 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr1 24612621-24612850 *
## [2] chr1 48881431-48881690 *
## [3] chr1 58613871-58614090 *
## [4] chr1 78573921-78574140 *
## [5] chr1 88217961-88221950 *
## ... ... ... ...
## [161] chr9 24541941-24542200 *
## [162] chr9 35305121-35305620 *
## [163] chr9 110281191-110281400 *
## [164] chr9 123872951-123873160 *
## [165] chrM 2-16299 *
## -------
## seqinfo: 20 sequences from an unspecified genome; no seqlengths
seqlevelsStyle(blacklist) <- "ensembl"
get consensus peaks and remove blacklist
#total consensus peaks (more than in one sample)
x <- c(peaks_Ctrl_BP,peaks_TDG_BP)
total_consensus_peaks <- reduce(unlist(GRangesList(x)), with.revmap=TRUE)
y <- keepStandardChromosomes(total_consensus_peaks, pruning.mode = "coarse")
total_consensus_peaks <- granges(y[lengths(y$revmap)>1])
blacklist <- import("/reference/Mus_musculus/mm10.blacklist_chrM.bed")
seqlevelsStyle(blacklist) <- "ensembl"
total_consensus_peaks <- total_consensus_peaks[!overlapsAny(total_consensus_peaks, blacklist)]# thats how you get rid of blacklisted peaks!!!!
total_consensus_peaks
## GRanges object with 34791 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] 1 3193206-3194116 *
## [2] 1 3371834-3373144 *
## [3] 1 3670571-3672240 *
## [4] 1 4256517-4258768 *
## [5] 1 4326626-4326879 *
## ... ... ... ...
## [34787] Y 90806973-90807763 *
## [34788] Y 90810055-90813626 *
## [34789] Y 90819329-90819916 *
## [34790] Y 90825910-90828204 *
## [34791] Y 90835647-90836399 *
## -------
## seqinfo: 22 sequences from an unspecified genome; no seqlengths
# Tdg+/- consensus peaks (more than in one sample)
x <- c(peaks_TDG_BP)
TDG_consensus_peaks <- reduce(unlist(GRangesList(x)), with.revmap=TRUE)
y <- keepStandardChromosomes(TDG_consensus_peaks, pruning.mode = "coarse")
TDG_consensus_peaks <- granges(y[lengths(y$revmap)>1]) #for standard chromosomes use keep this line of code
blacklist <- import("/reference/Mus_musculus/mm10.blacklist_chrM.bed")
seqlevelsStyle(blacklist) <- "ensembl"
TDG_consensus_peaks <- TDG_consensus_peaks[!overlapsAny(TDG_consensus_peaks, blacklist)]# thats how you get rid of blacklisted peaks!!!!
TDG_consensus_peaks
## GRanges object with 7735 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] 1 3193206-3194116 *
## [2] 1 4256517-4258752 *
## [3] 1 4559822-4560973 *
## [4] 1 4766599-4766942 *
## [5] 1 4834934-4835688 *
## ... ... ... ...
## [7731] Y 90786836-90791666 *
## [7732] Y 90798938-90800732 *
## [7733] Y 90811286-90813626 *
## [7734] Y 90819329-90819916 *
## [7735] Y 90825910-90828112 *
## -------
## seqinfo: 22 sequences from an unspecified genome; no seqlengths
signal plots with trimmed tdg consensus peaks
TDG_consensus_peaks_2000 <- TDG_consensus_peaks[width(TDG_consensus_peaks)<=2000]
TDG_consensus_peaks_2000
## GRanges object with 7343 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] 1 3193206-3194116 *
## [2] 1 4559822-4560973 *
## [3] 1 4766599-4766942 *
## [4] 1 4834934-4835688 *
## [5] 1 4857457-4858101 *
## ... ... ... ...
## [7339] Y 90763634-90764058 *
## [7340] Y 90765485-90765707 *
## [7341] Y 90770497-90771246 *
## [7342] Y 90798938-90800732 *
## [7343] Y 90819329-90819916 *
## -------
## seqinfo: 22 sequences from an unspecified genome; no seqlengths
generate normalization factors for matrices
normfactors <- epiwraps::bwNormFactors(bigwig_full)
## Warning: bwNormFactors is deprecated, please gradually switch to
## `getNormFactors`.
## Comparing coverage in random regions...
generate matrices TDG consensus peaks
m_bp_TDG_2000 <- signal2Matrix(bigwig_full,regions = TDG_consensus_peaks_2000,w = 10L)
## Reading ../Tdg_wt/tracks/Kg_tdg_ko_1_EKDL220000948-1a_H22J7DSX3_L2.bw
## Reading ../Tdg_wt/tracks/Kg_tdg_ko_2_EKDL220003833-1a_HJ5T2DSX3_L1.bw
## Reading ../Tdg_wt/tracks/Kg_tdg_ko_3_EKDL220003834-1a_HJ5T2DSX3_L1.bw
## Reading ../Tdg_wt/tracks/Kg_tdg_M_1_EKDL220003835-1a_HJ5T2DSX3_L1.bw
## Reading ../Tdg_wt/tracks/Kg_tdg_M_3_EKDL220003836-1a_HJ5T2DSX3_L1.bw
m_bp_TDG_bg_2000 <- rescaleSignalMatrices(m_bp_TDG_2000,normfactors)
saveRDS(m_bp_TDG_bg_2000,"../object_resources/figure_2_meta/m_bp_TDG_bg_2000.rds")
#m_bp_TDG_bg_2000 <- readRDS("../object_resources/figure_2_meta/m_bp_TDG_bg_2000.rds")
aggregate matrix
sm1 <- list( wt =mergeSignalMatrices(m_bp_TDG_bg_2000[4:5],aggregation = "mean"), "Tdg +/-" =mergeSignalMatrices(m_bp_TDG_bg_2000[1:3],aggregation = "mean"))
generate cluster
set.seed(123) # to ensure that it gives the same results everytime
cl <- clusterSignalMatrices(m_bp_TDG_bg_2000, k=5)
## ~90% of the variance explained by clusters
mycolors <- c("1"=brewer.pal(n=9,"Set1")[1], "2"=brewer.pal(n=9,"Set1")[2],"3"=brewer.pal(n=9,"Set1")[3],"4"=brewer.pal(n=9,"Set1")[4],"5"=brewer.pal(n=9,"Set1")[5])
p1 <- plotEnrichedHeatmaps(sm1, row_title=gt_render("*Tdg +/-* accessible regions") ,scale_title = "backgr.\nnorm.\ndensity", colors = rocket(100),row_split = cl,mean_color = mycolors, scale_rows = FALSE,,column_title_gp=gpar(fontface = "italic") )
## Loading required namespace: gridtext
print(p1)
saveRDS(p1,"p1.rds")
p1 <- readRDS("p1.rds")
Import of Data blacklist still included
bigwigfiles_ctrl <- list.files("../Tdg_wt/tracks",pattern = "^Kg_tdg_M(.*)\\L.\\.bw$", full =TRUE)
bigwigfiles_TDG <- list.files("../Tdg_wt/tracks",pattern = "^Kg_tdg_ko(.*)\\L.\\.bw$", full =TRUE)
names(bigwigfiles_ctrl) <- c("Control_1", "Control_2")
names(bigwigfiles_TDG) <- c("TDG_KO1", "TDG_KO2","TDG_KO3")
setwd("../Tdg_wt/peaks/")
peaks_TDG_NF <- c(TDG_KO1_NF = rtracklayer::import("NFpeaks/Kg_tdg_ko_1_EKDL220000948-1a_H22J7DSX3_L2_peaks.narrowPeak"), TDG_KO2_NF = rtracklayer::import("NFpeaks/Kg_tdg_ko_2_EKDL220003833-1a_HJ5T2DSX3_L1_peaks.narrowPeak"),TDG_KO3_NF = rtracklayer::import("NFpeaks/Kg_tdg_ko_3_EKDL220003834-1a_HJ5T2DSX3_L1_peaks.narrowPeak"))
peaks_Ctrl_NF <- c(Control_1 = rtracklayer::import("NFpeaks/Kg_tdg_M_1_EKDL220003835-1a_HJ5T2DSX3_L1_peaks.narrowPeak"), Control_2 = rtracklayer::import("NFpeaks/Kg_tdg_M_3_EKDL220003836-1a_HJ5T2DSX3_L1_peaks.narrowPeak"))
blacklist
blacklist <- import("/reference/Mus_musculus/mm10.blacklist_chrM.bed")
blacklist
## GRanges object with 165 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr1 24612621-24612850 *
## [2] chr1 48881431-48881690 *
## [3] chr1 58613871-58614090 *
## [4] chr1 78573921-78574140 *
## [5] chr1 88217961-88221950 *
## ... ... ... ...
## [161] chr9 24541941-24542200 *
## [162] chr9 35305121-35305620 *
## [163] chr9 110281191-110281400 *
## [164] chr9 123872951-123873160 *
## [165] chrM 2-16299 *
## -------
## seqinfo: 20 sequences from an unspecified genome; no seqlengths
seqlevelsStyle(blacklist) <- "ensembl"
#total consensus peaks (more than in one sample)
x <- c(peaks_Ctrl_NF,peaks_TDG_NF)
total_consensus_NFpeaks <- reduce(unlist(GRangesList(x)), with.revmap=TRUE)
y <- keepStandardChromosomes(total_consensus_NFpeaks, pruning.mode = "coarse")
total_consensus_NFpeaks <- granges(y[lengths(y$revmap)>1])
blacklist <- import("/reference/Mus_musculus/mm10.blacklist_chrM.bed")
seqlevelsStyle(blacklist) <- "ensembl"
total_consensus_NFpeaks <- total_consensus_NFpeaks[!overlapsAny(total_consensus_NFpeaks, blacklist)]# thats how you get rid of blacklisted peaks!!!!
total_consensus_NFpeaks
## GRanges object with 39905 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] 1 3372705-3372895 *
## [2] 1 3417800-3417913 *
## [3] 1 3441963-3442149 *
## [4] 1 4326614-4326823 *
## [5] 1 4513999-4514253 *
## ... ... ... ...
## [39901] Y 90799268-90799393 *
## [39902] Y 90800404-90800516 *
## [39903] Y 90803521-90803614 *
## [39904] Y 90811208-90811823 *
## [39905] Y 90811882-90813021 *
## -------
## seqinfo: 22 sequences from an unspecified genome; no seqlengths
# Tdg+/- consensus peaks (more than in one sample)
x <- c(peaks_TDG_NF)
TDG_consensus_NFpeaks <- reduce(unlist(GRangesList(x)), with.revmap=TRUE)
y <- keepStandardChromosomes(TDG_consensus_NFpeaks, pruning.mode = "coarse")
TDG_consensus_NFpeaks <- granges(y[lengths(y$revmap)>1]) #for standard chromosomes use keep this line of code
blacklist <- import("/reference/Mus_musculus/mm10.blacklist_chrM.bed")
seqlevelsStyle(blacklist) <- "ensembl"
TDG_consensus_NFpeaks <- TDG_consensus_NFpeaks[!overlapsAny(TDG_consensus_NFpeaks, blacklist)]# thats how you get rid of blacklisted peaks!!!!
TDG_consensus_NFpeaks
## GRanges object with 1003 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] 1 7088732-7088957 *
## [2] 1 7089023-7089173 *
## [3] 1 7397998-7398291 *
## [4] 1 9545227-9545430 *
## [5] 1 9748383-9748510 *
## ... ... ... ...
## [999] Y 90737643-90737897 *
## [1000] Y 90738088-90738308 *
## [1001] Y 90811268-90811711 *
## [1002] Y 90811882-90812139 *
## [1003] Y 90812150-90813021 *
## -------
## seqinfo: 22 sequences from an unspecified genome; no seqlengths
generate normalization factors
normfactors <- epiwraps::bwNormFactors(c(bigwigfiles_ctrl,bigwigfiles_TDG),)
## Warning: bwNormFactors is deprecated, please gradually switch to
## `getNormFactors`.
## Comparing coverage in random regions...
generate matrix for TDG consensus peaks
m_NP_TDG <- signal2Matrix(c(bigwigfiles_ctrl,bigwigfiles_TDG),regions = TDG_consensus_NFpeaks,w = 10L)
## Reading ../Tdg_wt/tracks/Kg_tdg_M_1_EKDL220003835-1a_HJ5T2DSX3_L1.bw
## Reading ../Tdg_wt/tracks/Kg_tdg_M_3_EKDL220003836-1a_HJ5T2DSX3_L1.bw
## Reading ../Tdg_wt/tracks/Kg_tdg_ko_1_EKDL220000948-1a_H22J7DSX3_L2.bw
## Reading ../Tdg_wt/tracks/Kg_tdg_ko_2_EKDL220003833-1a_HJ5T2DSX3_L1.bw
## Reading ../Tdg_wt/tracks/Kg_tdg_ko_3_EKDL220003834-1a_HJ5T2DSX3_L1.bw
m_NP_TDG_bg <- rescaleSignalMatrices(m_NP_TDG,normfactors)
saveRDS(m_NP_TDG_bg,"../object_resources/figure_2_meta/m_NP_TDG_bg.rds")
aggregate matrix
#m_NP_TDG_bg <- readRDS("../object_resources/figure_2_meta/m_NP_TDG_bg.rds")
sm2 <- list( wt =mergeSignalMatrices(m_NP_TDG_bg[1:2],aggregation = "mean"), "Tdg +/-" =mergeSignalMatrices(m_NP_TDG_bg[3:5],aggregation = "mean"))
p2 <- plotEnrichedHeatmaps(sm2, row_title=gt_render("*Tdg +/-* NFA regions"),scale_title = "backgr.\nnorm.\ndensity", colors = rocket(100),column_title_gp=gpar(fontface = "italic"),use_raster = TRUE)
print(p2)
saveRDS(p2,"p2.rds")
p2 <- readRDS("p2.rds")
bamfiles
bamfiles_ctrl <- list.files("../Tdg_wt/aligned",pattern = "^Kg_tdg_M(.*)\\.bam$", full =TRUE)
bamfiles_TDG <- list.files("../Tdg_wt/aligned",pattern = "^Kg_tdg_ko(.*)\\.bam$", full =TRUE)
names(bamfiles_ctrl) <- c("Control_1", "Control_2")
names(bamfiles_TDG) <- c("TDG_KO1", "TDG_KO2","TDG_KO3")
NFpeaks Import of Data blacklist still included
bigwigfiles_ctrl <- list.files("../Tdg_wt/tracks",pattern = "^Kg_tdg_M(.*)\\L.\\.bw$", full =TRUE)
bigwigfiles_TDG <- list.files("../Tdg_wt/tracks",pattern = "^Kg_tdg_ko(.*)\\L.\\.bw$", full =TRUE)
names(bigwigfiles_ctrl) <- c("Control_1", "Control_2")
names(bigwigfiles_TDG) <- c("TDG_KO1", "TDG_KO2","TDG_KO3")
setwd("../Tdg_wt/peaks/")
peaks_TDG_NF <- c(TDG_KO1_NF = rtracklayer::import("NFpeaks/Kg_tdg_ko_1_EKDL220000948-1a_H22J7DSX3_L2_peaks.narrowPeak"), TDG_KO2_NF = rtracklayer::import("NFpeaks/Kg_tdg_ko_2_EKDL220003833-1a_HJ5T2DSX3_L1_peaks.narrowPeak"),TDG_KO3_NF = rtracklayer::import("NFpeaks/Kg_tdg_ko_3_EKDL220003834-1a_HJ5T2DSX3_L1_peaks.narrowPeak"))
peaks_Ctrl_NF <- c(Control_1 = rtracklayer::import("NFpeaks/Kg_tdg_M_1_EKDL220003835-1a_HJ5T2DSX3_L1_peaks.narrowPeak"), Control_2 = rtracklayer::import("NFpeaks/Kg_tdg_M_3_EKDL220003836-1a_HJ5T2DSX3_L1_peaks.narrowPeak"))
blacklist
blacklist <- import("/reference/Mus_musculus/mm10.blacklist_chrM.bed")
blacklist
seqlevelsStyle(blacklist) <- "ensembl"
#total consensus peaks (more than in one sample)
x <- c(peaks_Ctrl_NF,peaks_TDG_NF)
total_consensus_NFpeaks <- reduce(unlist(GRangesList(x)), with.revmap=TRUE)
y <- keepStandardChromosomes(total_consensus_NFpeaks, pruning.mode = "coarse")
total_consensus_NFpeaks <- granges(y[lengths(y$revmap)>1])
blacklist <- import("/reference/Mus_musculus/mm10.blacklist_chrM.bed")
seqlevelsStyle(blacklist) <- "ensembl"
total_consensus_NFpeaks <- total_consensus_NFpeaks[!overlapsAny(total_consensus_NFpeaks, blacklist)]# thats how you get rid of blacklisted peaks!!!!
total_consensus_NFpeaks
# Tdg+/- consensus peaks (more than in one sample)
x <- c(peaks_TDG_NF)
TDG_consensus_NFpeaks <- reduce(unlist(GRangesList(x)), with.revmap=TRUE)
y <- keepStandardChromosomes(TDG_consensus_NFpeaks, pruning.mode = "coarse")
TDG_consensus_NFpeaks <- granges(y[lengths(y$revmap)>1]) #for standard chromosomes use keep this line of code
blacklist <- import("/reference/Mus_musculus/mm10.blacklist_chrM.bed")
seqlevelsStyle(blacklist) <- "ensembl"
TDG_consensus_NFpeaks <- TDG_consensus_NFpeaks[!overlapsAny(TDG_consensus_NFpeaks, blacklist)]# thats how you get rid of blacklisted peaks!!!!
TDG_consensus_NFpeaks
load motifs and check select non-redundant TFs
core <- read_meme(file = "../object_resources/HOCOMOCOv11_core_MOUSE_mono_meme_format.meme") # needs to be the right path
#full <- read_meme(file = "HOCOMOCOv11_full_MOUSE_mono_meme_format.meme")
# original function PL
summary_core <- universalmotif::summarise_motifs(core)
core <- setNames(object = core,nm = summary_core$name)
motifs <- core
modf <- data.frame(row.names=summary_core$name,
TF=gsub("_MOUSE.H11MO...+","",summary_core$name),
grade=gsub(".+\\.","",summary_core$name),
motif_rank = gsub("[^01]+","",gsub(".*MO.",replacement = "",summary_core$name))) #gsub("[^ro]+", "", str1)
modf <- modf[order(modf$TF,modf$grade,modf$motif_rank),]
modf <- modf[!duplicated(modf$TF),]
modf <- modf[!modf$grade == "D",]
motifs <- setNames(universalmotif::convert_motifs(motifs[row.names(modf)]), modf$TF)
import genome fasta file (GRCm38)
genome <- Rsamtools::FaFile("../object_resources/Mus_musculus.GRCm38.dna_sm.primary_assembly.fa")
genome
## class: FaFile
## path: ../object_resources/Mus_musculus.GRCm38.dna_sm.primary_assembly.fa
## index: ../object_resources.../Mus_musculus.GRCm38.dna_sm.primary_assembly.fa.fai
## gzindex: ../object_resourc.../Mus_musculus.GRCm38.dna_sm.primary_assembly.fa.gzi
## isOpen: FALSE
## yieldSize: NA
count reads in peaks
p <- resize(TDG_consensus_NFpeaks,width=250,fix="center")
sc_np <- chromVAR::getCounts(c(bamfiles_ctrl,bamfiles_TDG), p, paired=TRUE)
## Reading in file: ../Tdg_wt/aligned/Kg_tdg_M_1_EKDL220003835-1a_HJ5T2DSX3_L1.bam
## Reading in file: ../Tdg_wt/aligned/Kg_tdg_M_3_EKDL220003836-1a_HJ5T2DSX3_L1.bam
## Reading in file: ../Tdg_wt/aligned/Kg_tdg_ko_1_EKDL220000948-1a_H22J7DSX3_L2.bam
## Reading in file: ../Tdg_wt/aligned/Kg_tdg_ko_2_EKDL220003833-1a_HJ5T2DSX3_L1.bam
## Reading in file: ../Tdg_wt/aligned/Kg_tdg_ko_3_EKDL220003834-1a_HJ5T2DSX3_L1.bam
sc_np <- addGCBias(sc_np, genome=genome)
saveRDS(sc_np,"../object_resources/figure_2_meta/sc_np.rds")
#sc_np <- readRDS("../object_resources/figure_2_meta/sc_np.rds")
convert motifs and perfrom compute Deviations
set.seed(123)
motifs <- do.call(what = "PFMatrixList", lapply(motifs, class="TFBSTools-PFMatrix", FUN= convert_motifs)) #old way and new with mouse hocomocov11 changed to lapply(motifs) from
motif_ix <- matchMotifs(motifs, sc_np, genome=genome)
dev_np <- computeDeviations(object=sc_np, annotations=motif_ix, background_peaks = getBackgroundPeaks(sc_np,rowData(sc_np)$bias,niterations = 5000))
z-score distribution test
d <- dplyr::bind_rows(lapply(as.list(as.data.frame(as.matrix(assays(dev_np)$z))),FUN=function(x) as.data.frame(as.matrix(x))), .id="sample")
ggplot(d, aes(V1, colour=sample)) + geom_density() + xlab("z-score")
assays(dev_np)$norm <- scale(assays(dev_np)$z)
d <- dplyr::bind_rows(lapply(as.list(as.data.frame(as.matrix(assays(dev_np)$norm))),FUN=function(x) as.data.frame(as.matrix(x))), .id="sample")
ggplot(d, aes(V1, colour=sample)) + geom_density() + xlab("z-score")
saveRDS(dev_np,"../object_resources/figure_2_meta/dev_np.rds")
# differential analysis
dev_np <- readRDS("../object_resources/figure_2_meta/dev_np.rds")
dev_np$groups <- c("wt","wt","Tdg +/-","Tdg +/-","Tdg +/-")
dev_np$genotype <- c("wt","wt","Tdg +/-","Tdg +/-","Tdg +/-")
dev_np$groups <- relevel(factor(dev_np$groups), "wt")
fit <- limma::lmFit(assays(dev_np)$norm, model.matrix(~dev_np$groups))
head(mores1 <- topTable(eBayes(fit),number=Inf),400)
## Removing intercept from test coefficients
metadata(dev_np)$anno_colors <- list(genotype=c(wt ="darkgrey", "Tdg +/-"="darkred"))
colData(dev_np)
## DataFrame with 5 rows and 3 columns
## depth groups genotype
## <numeric> <factor> <character>
## Kg_tdg_M_1_EKDL220003835-1a_HJ5T2DSX3_L1.bam 113335120 wt wt
## Kg_tdg_M_3_EKDL220003836-1a_HJ5T2DSX3_L1.bam 109686923 wt wt
## Kg_tdg_ko_1_EKDL220000948-1a_H22J7DSX3_L2.bam 79111085 Tdg +/- Tdg +/-
## Kg_tdg_ko_2_EKDL220003833-1a_HJ5T2DSX3_L1.bam 74429079 Tdg +/- Tdg +/-
## Kg_tdg_ko_3_EKDL220003834-1a_HJ5T2DSX3_L1.bam 98555192 Tdg +/- Tdg +/-
generate heatmap for C
p3_2 <- sechm::sechm(dev_np[,order(dev_np$groups)], head(row.names(mores1),26), assayName = "norm", top_annotation="genotype", breaks=0.99,do.scale = "FALSE",name ="norm. z",gaps_at = "groups",show_annotation_legend = FALSE,row_names_gp = gpar(fontsize = 9),column_title_gp=gpar(fontface = "italic"))
p3_2
metadata(dev_np)
## $anno_colors
## $anno_colors$genotype
## wt Tdg +/-
## "darkgrey" "darkred"
generate volcano plot for C
highlight <- c("PRGR","ANDR","GCR")
w <- head(which(mores1$adj.P.Val < 0.05 & !( rownames(mores1) %in% highlight)),30) #& !( rownames(mores1) %in% highlight)
w2 <- mores1[rownames(mores1) %in% highlight,]
p3_1 <- ggplot(mores1, aes(logFC, -log10(adj.P.Val ))) +
geom_point() +
geom_point(data=w2,color = "darkred",size = 3,aes(logFC, -log10(adj.P.Val )))+
#geom_text_repel(data=cbind(gene=row.names(mores1)[w], mores1[w,]), color="black", aes(label=gene),max.overlaps = 20,size=3)+
geom_text_repel(data=cbind(gene=row.names(w2),w2), color="darkred", aes(label=gene),nudge_x = 1.5,nudge_y = 0.1)+
geom_hline(yintercept=-log10(0.05), linetype="dashed")+
geom_text(aes(-5.5,-log10(0.05),label = "adj. p-value = 0.05", vjust = 1))+
ggtitle(("Motif activity at accessible *Tdg +/-* sites"))+
theme(plot.title = ggtext::element_markdown())+
xlim(c(-7.5,9.5))+
xlab("activity norm. z-score logFC")
p3 <- plot_grid(p3_1, grid.grabExpr(draw(p3_2)), nrow=1, rel_heights=c(4,2), scale=0.95,ncol = 2)
p3
saveRDS(p3,"../object_resources/figure_2_meta/motif actiivity.rds")
p3 <- readRDS("../object_resources/figure_2_meta/motif actiivity.rds")
generate motifs of TFs of interest
load motifs and check select non-redundant TFs
core <- read_meme(file = "../object_resources/HOCOMOCOv11_core_MOUSE_mono_meme_format.meme") # needs to be the right path
#full <- read_meme(file = "HOCOMOCOv11_full_MOUSE_mono_meme_format.meme")
# original function PL
summary_core <- universalmotif::summarise_motifs(core)
core <- setNames(object = core,nm = summary_core$name)
motifs <- core
modf <- data.frame(row.names=summary_core$name,
TF=gsub("_MOUSE.H11MO...+","",summary_core$name),
grade=gsub(".+\\.","",summary_core$name),
motif_rank = gsub("[^01]+","",gsub(".*MO.",replacement = "",summary_core$name))) #gsub("[^ro]+", "", str1)
modf <- modf[order(modf$TF,modf$grade,modf$motif_rank),]
modf <- modf[!duplicated(modf$TF),]
modf <- modf[!modf$grade == "D",]
motifs <- setNames(universalmotif::convert_motifs(motifs[row.names(modf)]), modf$TF)
p3_ml <- plot_grid(
universalmotif::view_motifs(motifs$PRGR,)+geom_text(aes(x= 8, y= 1.75,,label = "PRGR"),size=3)+theme_void(),
universalmotif::view_motifs(motifs$GCR)+geom_text(aes(x= 8, y= 1.75,,label = "GCR"),size=3)+theme_void(),
universalmotif::view_motifs(motifs$ANDR)+geom_text(aes(x= 8, y= 1.75,,label = "ANDR"),size=3)+theme_void(),
nrow =1,
scale = 0.95,
ncol = 3
)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the ggseqlogo package.
## Please report the issue at <https://github.com/omarwagih/ggseqlogo/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
p3_ml
p3_both <- plot_grid(p3,p3_ml , nrow=2, rel_heights=c(6,1), scale=0.95 )
p3_both
se <- readRDS("../object_resources/RNAseq_mESC_tdg.SE.rds")
se$genotype
## [1] "wt" "wt" "het" "het" "ko" "ko" "ko" "wt" "het" "ko"
se$genotype <- relevel(factor(se$genotype), "wt")
#filter for genes with at least 20 counts
dds <- calcNormFactors(DGEList(assay(se)))
dds <- dds[filterByExpr(dds,model.matrix(~se$genotype),min.count=20),]
se <- se[row.names(dds),]
generation of surrogate variables
se <- SEtools::svacor(se, ~genotype)
## Using variance-stabilizing transformation
## converting counts to integer mode
## Number of significant surrogate variables is: 3
## Iteration (out of 5 ):1 2 3 4 5
form <- paste0("~genotype+",paste(grep("^SV[0-3]+",colnames(colData(se)),value=TRUE),collapse="+")) # there are 3 surrogate variables
mm <- model.matrix(as.formula(form), data=as.data.frame(colData(se)))
dds <- calcNormFactors(DGEList(assay(se)))
dds <- estimateDisp(dds,mm)
fit <- glmQLFit(dds, mm)
tcoefs <- grep("^genotype",colnames(mm),value=TRUE)
glrt <- as.data.frame(topTags(glmQLFTest(fit, tcoefs),Inf))
rowData(se)$global.FDR <- glrt[row.names(se),"FDR"]
for (x in tcoefs){
n <- gsub("genotype","DEA.",x)
y <- as.data.frame(topTags(glmQLFTest(fit, x),Inf))
rowData(se)[[n]] <- y[row.names(se),]
}
lapply(sechm::getDEA(se),FUN=function(x) hist(x$PValue))
## $het
## $breaks
## [1] 0.00 0.05 0.10 0.15 0.20 0.25 0.30 0.35 0.40 0.45 0.50 0.55 0.60 0.65 0.70
## [16] 0.75 0.80 0.85 0.90 0.95 1.00
##
## $counts
## [1] 4128 1294 904 758 578 553 477 451 415 387 386 407 378 335 345
## [16] 304 367 315 297 314
##
## $density
## [1] 6.1644142 1.9323527 1.3499589 1.1319346 0.8631375 0.8258045 0.7123124
## [8] 0.6734861 0.6197267 0.5779138 0.5764205 0.6077802 0.5644740 0.5002613
## [15] 0.5151945 0.4539685 0.5480475 0.4703950 0.4435153 0.4689017
##
## $mids
## [1] 0.025 0.075 0.125 0.175 0.225 0.275 0.325 0.375 0.425 0.475 0.525 0.575
## [13] 0.625 0.675 0.725 0.775 0.825 0.875 0.925 0.975
##
## $xname
## [1] "x$PValue"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $ko
## $breaks
## [1] 0.00 0.05 0.10 0.15 0.20 0.25 0.30 0.35 0.40 0.45 0.50 0.55 0.60 0.65 0.70
## [16] 0.75 0.80 0.85 0.90 0.95 1.00
##
## $counts
## [1] 1735 1104 860 779 686 661 626 601 572 524 581 538 499 522 518
## [16] 549 549 514 475 500
##
## $density
## [1] 2.5909057 1.6486224 1.2842530 1.1632943 1.0244157 0.9870828 0.9348167
## [8] 0.8974838 0.8541776 0.7824983 0.8676174 0.8034048 0.7451654 0.7795117
## [15] 0.7735384 0.8198313 0.8198313 0.7675651 0.7093258 0.7466587
##
## $mids
## [1] 0.025 0.075 0.125 0.175 0.225 0.275 0.325 0.375 0.425 0.475 0.525 0.575
## [13] 0.625 0.675 0.725 0.775 0.825 0.875 0.925 0.975
##
## $xname
## [1] "x$PValue"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
calculation of log fold change
se <- SEtools::log2FC(se, "corrected", se$genotype=="wt", isLog = TRUE)
se <- se[,order(se$genotype)]
metadata(se)$default_view <- list(assay="log2FC", gaps_at="genotype",
top_annotation=c("wt"))
metadata(se)$default_view$groupvar <- metadata(se)$default_view$colvar <- NULL
pick top significantly diff expressed genes and heatmap
deas <- getDEA(se)
gdegs <- row.names(se)[head(order(rowData(se)$global.FDR),20)]
se$anno <- c("wt","wt","wt","Tdg +/-","Tdg +/-","Tdg +/-","ko","ko","ko","ko")
se$anno <- relevel(factor(se$anno), "wt")
metadata(se)$anno_colors <- list(genotype=c(wt="lightgrey", het="darkred"))
s1 <- sechm(se[,!se$genotype=="ko"], head(rownames(deas$het),20),assayName = "log2FC", gaps_at = "anno",do.scale = FALSE,top_annotation = "genotype",show_annotation_legend = FALSE,column_title_gp=gpar(fontface = "italic"))
#deasdeadegs <- lapply(deas, FUN=function(x) row.names(x)[x$FDR<0.05]) # check for genes passign the FDR
dea <- deas$het
dea$gene <- rownames(dea)
s2 <- ggplot(dea, aes(logFC, -log10(FDR))) +
geom_point() +
ggrepel::geom_text_repel(data=head(dea[dea$FDR<0.05 ,],15),size=4,aes(label=gene),max.overlaps = 20,nudge_y = 0.25)+
ggtitle("Differentially expressed genes in *Tdg +/-* mESC")+
xlim(c(-5,5))+
theme(plot.title = ggtext::element_markdown())
s2
## Warning: Removed 15 rows containing missing values (`geom_point()`).
ss <- plot_grid(s2,grid.grabExpr(draw(s1)),labels = c("A","B"),rel_widths = c(3,2))
## Warning: Removed 15 rows containing missing values (`geom_point()`).
ss
saveRDS(ss,"../object_resources/figure_2_meta/fig_S3.rds")
mreg <- readRDS("../object_resources/regulon.rds")
viper-limma
vi <- viper(assays(se)$corrected, mreg, pleiotropy=TRUE, verbose=FALSE)
vi <- SummarizedExperiment(list(viper=vi), colData=colData(se))
metadata(vi) <- metadata(se)
vi <- SEtools::log2FC(vi, "viper", vi$genotype=="wt", isLog=TRUE)
fit <- eBayes(lmFit(assay(vi), model.matrix(~vi$genotype,data=as.data.frame(colData(vi)))))
## Warning: Zero sample variances detected, have been offset away from zero
vires <- as.data.frame(topTable(fit, "vi$genotypehet", Inf))
rowData(vi)$DEA.viper <- vires[row.names(vi),]
vires$TF <- row.names(vires)
label1 <- c("ESR2","TEAD2","HXA9","CEBPD","PRDM1")
label2 <- "ANDR"
highlight <- "Ar"
w <- vires[rownames(vires) %in% highlight,]
p4_1 <- ggplot(vires, aes(logFC, -log10(adj.P.Val), label=TF)) +
geom_point() +
geom_point(data=w,color = "darkred",size = 3,aes(logFC, -log10(adj.P.Val )))+
ggrepel::geom_text_repel(data=head(vires[vires$adj.P.Val<0.001 &!(rownames(vires) %in% highlight) ,],5),aes(label = label1),nudge_x = -0.15,nudge_y = 1.75, min.segment.length=0,size=3)+
geom_text_repel(data=cbind(gene=rownames(w),w), color="darkred", aes(label=label2,nudge_x = 0.75,nudge_y = 5))+
ggtitle("Inferred TF activity in *Tdg +/-* of mESC")+
theme(plot.title = ggtext::element_markdown())+
xlab("log2FC")
## Warning in geom_text_repel(data = cbind(gene = rownames(w), w), color =
## "darkred", : Ignoring unknown aesthetics: nudge_x and nudge_y
p4_1
vi$genotype <- relevel(vi$genotype,"wt")
vi$anno <- c("wt","wt","wt","Tdg +/-","Tdg +/-","Tdg +/-","ko","ko","ko","ko")
#vi$anno <- c("wt","wt","Tdg +/-","Tdg +/-","ko","ko","ko","wt","Tdg +/-","ko")
vi$anno <- relevel(factor(vi$anno), "wt")
metadata(vi)$anno_colors <- list(genotype=c(wt="lightgrey", het="darkred"))
# test for order of TFs
sechm::sechm(vi[,!vi$genotype=="ko"], head(vires$TF,6), assayName="log2FC", top_annotation = "genotype",do.scale = FALSE,gaps_at = "genotype",show_annotation_legend = FALSE,column_title_gp=gpar(fontface = "italic"))
gene_labels <- c("PRDM1","ANDR","CEBPD","TEAD2","HXA9","ESR2")
p4_2 <- sechm::sechm(vi[,!vi$genotype=="ko"], head(vires$TF,6), assayName="log2FC", top_annotation = "genotype",do.scale = FALSE,gaps_at = "anno",show_annotation_legend = FALSE,column_title_gp=gpar(fontface = "italic"),row_labels = gene_labels)
p4_2
p4_2 <- plot_grid(p4_1, grid.grabExpr(draw(p4_2)), nrow=1, rel_heights=c(4,2), scale=0.95,ncol = 2)
p4_2
saveRDS(p4_2,"../object_resources/figure_2_meta/p4_2_inferred_motif_activity.rds")
pp1 <- plot_grid(grid.grabExpr(draw(p1)),grid.grabExpr(draw(p2)),ncol = 2,labels = c("A","B"))
pp2 <- plot_grid(pp1,p3_both,p4_2, nrow = 3, labels = c(NA,"C","D"),rel_heights = c(3,3,2))
# now add the title
title <- ggdraw() +
draw_label(
"Figure 2",
fontface = 'bold',
x = 0,
hjust = 0
) +
theme(
# add margin on the left of the drawing canvas,
# so title is aligned with left edge of first plot
plot.margin = margin(0, 0, 0, 7)
)
pp3 <- plot_grid(
title, pp2,
ncol = 1,
# rel_heights values control vertical title margins
rel_heights = c(0.1, 3)
)
## Warning: Removed 1 rows containing missing values (`geom_text()`).
pp3
pdf("figure_2_meta.pdf", width=11, height=12)
pp3
dev.off()
## png
## 2
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] knitr_1.42 viper_1.24.0
## [3] fgsea_1.16.0 topGO_2.42.0
## [5] SparseM_1.81 GO.db_3.12.1
## [7] AnnotationDbi_1.52.0 graph_1.68.0
## [9] rGREAT_1.22.0 sechm_1.9.2
## [11] SEtools_1.9.3 SummarizedExperiment_1.20.0
## [13] Biobase_2.50.0 MatrixGenerics_1.2.1
## [15] matrixStats_0.63.0 edgeR_3.32.1
## [17] cowplot_1.1.1 limma_3.46.0
## [19] ggrepel_0.9.3 ggplot2_3.4.2
## [21] viridis_0.6.2 viridisLite_0.4.1
## [23] RColorBrewer_1.1-3 AnnotationHub_2.22.1
## [25] BiocFileCache_1.14.0 dbplyr_2.1.1
## [27] TFBSTools_1.28.0 universalmotif_1.8.5
## [29] MotifDb_1.32.0 Biostrings_2.58.0
## [31] XVector_0.30.0 memes_0.99.11
## [33] motifmatchr_1.12.0 chromVAR_1.12.0
## [35] rtracklayer_1.50.0 epiwraps_0.99.62
## [37] EnrichedHeatmap_1.29.2 ComplexHeatmap_2.15.1
## [39] GenomicRanges_1.42.0 GenomeInfoDb_1.26.7
## [41] IRanges_2.24.1 S4Vectors_0.28.1
## [43] BiocGenerics_0.36.1
##
## loaded via a namespace (and not attached):
## [1] rappdirs_0.3.3 R.methodsS3_1.8.1
## [3] nabor_0.5.0 tidyr_1.2.0
## [5] bit64_4.0.5 DelayedArray_0.16.3
## [7] R.utils_2.11.0 data.table_1.14.8
## [9] rpart_4.1.16 KEGGREST_1.30.1
## [11] RCurl_1.98-1.6 AnnotationFilter_1.14.0
## [13] doParallel_1.0.17 generics_0.1.3
## [15] GenomicFeatures_1.42.3 RSQLite_2.2.11
## [17] proxy_0.4-26 bit_4.0.5
## [19] tzdb_0.3.0 xml2_1.3.3
## [21] httpuv_1.6.9 assertthat_0.2.1
## [23] DirichletMultinomial_1.32.0 xfun_0.38
## [25] hms_1.1.1 jquerylib_0.1.4
## [27] evaluate_0.20 promises_1.2.0.1
## [29] TSP_1.2-0 fansi_1.0.4
## [31] progress_1.2.2 caTools_1.18.2
## [33] DBI_1.1.3 geneplotter_1.68.0
## [35] htmlwidgets_1.5.4 purrr_1.0.1
## [37] ellipsis_0.3.2 dplyr_1.1.1
## [39] backports_1.4.1 V8_4.1.0
## [41] markdown_1.1 annotate_1.68.0
## [43] biomaRt_2.46.3 vctrs_0.6.1
## [45] Cairo_1.6-0 ensembldb_2.14.1
## [47] cachem_1.0.7 withr_2.5.0
## [49] Gviz_1.34.1 BSgenome_1.58.0
## [51] checkmate_2.1.0 GenomicAlignments_1.26.0
## [53] prettyunits_1.1.1 cluster_2.1.4
## [55] segmented_1.4-0 lazyeval_0.2.2
## [57] seqLogo_1.56.0 crayon_1.5.2
## [59] genefilter_1.72.1 labeling_0.4.2
## [61] pkgconfig_2.0.3 nlme_3.1-157
## [63] ProtGenerics_1.22.0 seriation_1.3.4
## [65] nnet_7.3-17 rlang_1.1.0
## [67] lifecycle_1.0.3 miniUI_0.1.1.1
## [69] registry_0.5-1 dichromat_2.0-0.1
## [71] Matrix_1.5-3 ggseqlogo_0.1
## [73] base64enc_0.1-3 GlobalOptions_0.1.2
## [75] png_0.1-7 rjson_0.2.21
## [77] bitops_1.0-7 splitstackshape_1.4.8
## [79] R.oo_1.24.0 KernSmooth_2.23-20
## [81] blob_1.2.2 shape_1.4.6
## [83] stringr_1.5.0 readr_2.1.2
## [85] jpeg_0.1-10 CNEr_1.26.0
## [87] scales_1.2.1 memoise_2.0.1
## [89] magrittr_2.0.3 plyr_1.8.8
## [91] zlibbioc_1.36.0 compiler_4.0.3
## [93] clue_0.3-60 DESeq2_1.30.1
## [95] Rsamtools_2.6.0 cli_3.6.1
## [97] htmlTable_2.4.0 Formula_1.2-5
## [99] MASS_7.3-56 mgcv_1.8-39
## [101] tidyselect_1.2.0 stringi_1.7.12
## [103] highr_0.10 yaml_2.3.7
## [105] askpass_1.1 locfit_1.5-9.4
## [107] latticeExtra_0.6-29 sass_0.4.1
## [109] VariantAnnotation_1.36.0 fastmatch_1.1-3
## [111] randomcoloR_1.1.0.1 tools_4.0.3
## [113] circlize_0.4.15 rstudioapi_0.13
## [115] TFMPvalue_0.0.8 foreach_1.5.2
## [117] foreign_0.8-84 gridExtra_2.3
## [119] farver_2.1.1 Rtsne_0.16
## [121] digest_0.6.31 BiocManager_1.30.20
## [123] ggtext_0.1.1 shiny_1.7.1
## [125] pracma_2.3.8 gridtext_0.1.4
## [127] Rcpp_1.0.10 BiocVersion_3.12.0
## [129] later_1.3.0 httr_1.4.2
## [131] biovizBase_1.38.0 kernlab_0.9-32
## [133] Rdpack_2.3 colorspace_2.1-0
## [135] XML_3.99-0.9 splines_4.0.3
## [137] plotly_4.10.0 xtable_1.8-4
## [139] jsonlite_1.8.4 poweRlaw_0.70.6
## [141] GenomicFiles_1.26.0 UpSetR_1.4.0
## [143] R6_2.5.1 Hmisc_4.6-0
## [145] pillar_1.9.0 htmltools_0.5.5
## [147] mime_0.12 glue_1.6.2
## [149] fastmap_1.1.1 DT_0.22
## [151] BiocParallel_1.24.1 class_7.3-21
## [153] interactiveDisplayBase_1.28.0 codetools_0.2-19
## [155] utf8_1.2.3 lattice_0.21-8
## [157] bslib_0.3.1 tibble_3.2.1
## [159] sva_3.38.0 mixtools_1.2.0
## [161] curl_5.0.0 gtools_3.9.4
## [163] magick_2.7.3 zip_2.2.0
## [165] openxlsx_4.2.5 openssl_2.0.0
## [167] survival_3.3-1 rmarkdown_2.13
## [169] munsell_0.5.0 e1071_1.7-9
## [171] GetoptLong_1.0.5 GenomeInfoDbData_1.2.4
## [173] iterators_1.0.14 reshape2_1.4.4
## [175] gtable_0.3.3 rbibutils_2.2.8